Import data

# Load the raw stroke dataset (path is relative to the project root).
stroke_df <- read.csv("./data/healthcare-dataset-stroke-data.csv")
# head(stroke_df)

EDA

EDA and Visualization

Distribution of stroke:

# Outcome distribution: pie chart of the stroke vs. no-stroke proportions.
dataplot10 <- stroke_df %>% dplyr::count(stroke)
dataplot1 <- dataplot10 %>% mutate(ntotal = sum(n), perc = n / ntotal)

plot1 <- ggplot(dataplot1,
                aes(x = "", y = perc * 100,
                    fill = as.factor(stroke), group = as.factor(stroke))) +
  theme_bw() +
  geom_bar(width = 1, stat = "identity") + theme_void() +
  labs(x = " ", y = " ", fill = " ") +
  scale_fill_brewer(palette = "Dark2", labels = c("No stroke", "Stroke")) +
  # Percentage labels are computed from the data rather than hard-coded,
  # so the chart stays correct if the dataset changes.
  geom_text(data = dataplot1 %>% filter(stroke == 0),
            aes(label = sprintf("%.2f %%", perc * 100)), y = 55, size = 5) +
  geom_text(data = dataplot1 %>% filter(stroke == 1),
            aes(label = sprintf("%.2f %%", perc * 100)),
            y = 2.5, x = 1.3, size = 4) +
  coord_polar("y", start = 0) +
  theme(legend.text = element_text(size = 15))

plot1

We could see that only 4.87% of the 5110 individuals in the dataset suffered a stroke.

# Stroke probability by gender.
# pivot_wider() supersedes spread(); columns: neg = no stroke, pos = stroke.
dataplot2 <- stroke_df %>%
  dplyr::count(stroke, gender) %>%
  tidyr::pivot_wider(names_from = stroke, values_from = n)
names(dataplot2) <- c("gender", "neg", "pos")

# Empirical probability of stroke within each gender.
dataplot2 <- dataplot2 %>% mutate(perc_gender = pos / (pos + neg))

# "Other" is excluded from the plot (single-observation category).
plot2 <- ggplot(dataplot2 %>% filter(gender != "Other"),
                aes(x = gender, y = perc_gender * 100,
                    fill = as.factor(gender),
                    group = as.factor(gender))) +
  theme_bw() +
  geom_bar(stat = "identity") +
  labs(title = "Gender", x = "", y = "Probability of stroke (%)") +
  scale_fill_brewer(palette = "Dark2") +
  theme(legend.position = "none") +
  theme(text = element_text(size = 13.07, colour = "black")) +
  theme(axis.text.x = element_text(colour = "black", size = 13.07)) +
  theme(axis.text.y = element_text(colour = "black", size = 13.07))

Smoking status:

# Stroke probability by smoking status.
# pivot_wider() supersedes spread(); columns: neg = no stroke, pos = stroke.
dataplot3_1 <- stroke_df %>%
  dplyr::count(stroke, smoking_status) %>%
  tidyr::pivot_wider(names_from = stroke, values_from = n)
names(dataplot3_1) <- c("smoking_status", "neg", "pos")

dataplot3_1 <- dataplot3_1 %>% mutate(perc_smoke = pos / (pos + neg))

plot3 <- ggplot(dataplot3_1,
                aes(x = smoking_status, y = perc_smoke * 100,
                    fill = as.factor(smoking_status),
                    group = as.factor(smoking_status))) +
  theme_bw() +
  geom_bar(stat = "identity") +
  labs(title = "Smoking status", x = " ", y = "Probability of stroke (%)") +
  scale_fill_brewer(palette = "Dark2") +
  scale_x_discrete(labels = c("formerly smoked" = "Formerly smoked",
                              "never smoked" = "Never smoked",
                              "smokes" = "Smokes")) +
  theme(legend.position = "none") +
  theme(text = element_text(size = 13.07, colour = "black")) +
  theme(axis.text.x = element_text(colour = "black", size = 13.07)) +
  theme(axis.text.y = element_text(colour = "black", size = 13.07))

People who identified as former smokers have the highest probability of having a stroke (~8%), followed by smokers and then people who never smoked.

# Stroke probability by hypertension status (raw coding: 0 = no, 1 = yes).
# pivot_wider() supersedes spread(); columns: neg = no stroke, pos = stroke.
dataplot3_1a <- stroke_df %>%
  dplyr::count(stroke, hypertension) %>%
  tidyr::pivot_wider(names_from = stroke, values_from = n)
names(dataplot3_1a) <- c("hypertension", "neg", "pos")

dataplot3_1a <- dataplot3_1a %>% mutate(perc_hyp = pos / (pos + neg))

plot4 <- ggplot(dataplot3_1a,
                aes(x = as.factor(hypertension), y = perc_hyp * 100,
                    fill = as.factor(hypertension),
                    group = as.factor(hypertension))) +
  theme_bw() +
  geom_bar(stat = "identity") +
  labs(title = "Hypertension", x = " ", y = "Probability of stroke (%)",
       fill = " ") +
  scale_fill_brewer(palette = "Dark2") +
  scale_x_discrete(breaks = c("0", "1"),
                   labels = c("0" = "No hypertension", "1" = "Hypertension")) +
  theme(legend.position = "none") +
  theme(text = element_text(size = 13.07, colour = "black")) +
  theme(axis.text.x = element_text(colour = "black", size = 13.07)) +
  theme(axis.text.y = element_text(colour = "black", size = 13.07))
# Stroke probability by heart-disease status (raw coding: 0 = no, 1 = yes).
# pivot_wider() supersedes spread(); columns: neg = no stroke, pos = stroke.
dataplot3_1b <- stroke_df %>%
  dplyr::count(stroke, heart_disease) %>%
  tidyr::pivot_wider(names_from = stroke, values_from = n)
names(dataplot3_1b) <- c("heart_disease", "neg", "pos")

dataplot3_1b <- dataplot3_1b %>% mutate(perc_hd = pos / (pos + neg))

plot5 <- ggplot(dataplot3_1b,
                aes(x = as.factor(heart_disease), y = perc_hd * 100,
                    fill = as.factor(heart_disease),
                    group = as.factor(heart_disease))) +
  theme_bw() +
  geom_bar(stat = "identity") +
  labs(title = "Heart disease", x = "", y = "Probability of stroke (%)",
       fill = "Heart disease") +
  scale_fill_brewer(palette = "Dark2") +
  scale_x_discrete(breaks = c("0", "1"),
                   labels = c("0" = "No HD", "1" = "HD")) +
  theme(legend.position = "none") +
  theme(text = element_text(size = 13.07, colour = "black")) +
  theme(axis.text.x = element_text(colour = "black", size = 13.07)) +
  theme(axis.text.y = element_text(colour = "black", size = 13.07))
# Stroke probability by marital history (ever married: No/Yes).
# pivot_wider() supersedes spread(); columns: neg = no stroke, pos = stroke.
dataplot3_1c <- stroke_df %>%
  dplyr::count(stroke, ever_married) %>%
  tidyr::pivot_wider(names_from = stroke, values_from = n)
names(dataplot3_1c) <- c("ever_married", "neg", "pos")

dataplot3_1c <- dataplot3_1c %>% mutate(perc_em = pos / (pos + neg))

plot6 <- ggplot(dataplot3_1c,
                aes(x = ever_married, y = perc_em * 100,
                    fill = as.factor(ever_married),
                    group = as.factor(ever_married))) +
  theme_bw() +
  geom_bar(stat = "identity") +
  labs(title = "Ever married", x = "", y = "Probability of stroke (%)",
       fill = " ") +
  scale_fill_brewer(palette = "Dark2") +
  theme(legend.position = "none") +
  theme(text = element_text(size = 13.07, colour = "black")) +
  theme(axis.text.x = element_text(colour = "black", size = 13.07)) +
  theme(axis.text.y = element_text(colour = "black", size = 13.07))
# Stroke probability by work type.
# pivot_wider() supersedes spread(); columns: neg = no stroke, pos = stroke.
dataplot3_1d <- stroke_df %>%
  dplyr::count(stroke, work_type) %>%
  tidyr::pivot_wider(names_from = stroke, values_from = n)
names(dataplot3_1d) <- c("work_type", "neg", "pos")

dataplot3_1d <- dataplot3_1d %>% mutate(perc_wt = pos / (pos + neg))

# "Never_worked" is excluded from the plot — presumably because it has too
# few (or no) positive cases for a meaningful proportion; confirm with data.
plot7 <- ggplot(dataplot3_1d %>% filter(work_type != "Never_worked"),
                aes(x = work_type, y = perc_wt * 100,
                    fill = as.factor(work_type),
                    group = as.factor(work_type))) +
  theme_bw() +
  geom_bar(stat = "identity") +
  labs(title = "Work type", x = " ", y = "Probability of stroke (%)",
       fill = " ") +
  scale_fill_brewer(palette = "Dark2") +
  scale_x_discrete(labels = c("children" = "Children",
                              "Govt_job" = "Gov. Job")) +
  theme(legend.position = "none") +
  theme(text = element_text(size = 13.07, colour = "black")) +
  theme(axis.text.x = element_text(colour = "black", size = 13.07)) +
  theme(axis.text.y = element_text(colour = "black", size = 13.07))
# Stroke probability by residence type (Rural/Urban).
# pivot_wider() supersedes spread(); columns: neg = no stroke, pos = stroke.
dataplot3_1e <- stroke_df %>%
  dplyr::count(stroke, Residence_type) %>%
  tidyr::pivot_wider(names_from = stroke, values_from = n)
names(dataplot3_1e) <- c("Residence_type", "neg", "pos")

dataplot3_1e <- dataplot3_1e %>% mutate(perc_rt = pos / (pos + neg))

plot8 <- ggplot(dataplot3_1e,
                aes(x = Residence_type, y = perc_rt * 100,
                    fill = as.factor(Residence_type),
                    group = as.factor(Residence_type))) +
  theme_bw() +
  geom_bar(stat = "identity") +
  labs(title = "Residence type", x = " ", y = "Probability of stroke (%)",
       fill = " ") +
  scale_fill_brewer(palette = "Dark2") +
  theme(legend.position = "none") +
  theme(text = element_text(size = 13.07, colour = "black")) +
  theme(axis.text.x = element_text(colour = "black", size = 13.07)) +
  theme(axis.text.y = element_text(colour = "black", size = 13.07))

Categorical variables:

#figures
# Align all seven categorical-variable plots and arrange them in one figure.
# Bug fix: plot8 (residence type) was aligned as the seventh element but was
# previously left out of grid.arrange(); it is now included.
allplotslist_1 <- align_plots(plot2, plot3, plot4, plot5, plot6, plot7, plot8,
                              align = "hv")

grid_1 <- grid.arrange(allplotslist_1[[1]], allplotslist_1[[2]],
                       allplotslist_1[[3]], allplotslist_1[[4]],
                       allplotslist_1[[5]], allplotslist_1[[6]],
                       allplotslist_1[[7]], nrow = 3)

Continuous variable:

#age
# Smoothed age densities, stratified by stroke outcome.
plot9 <-
  ggplot(stroke_df,
         aes(x = age, group = as.factor(stroke), fill = as.factor(stroke))) +
  geom_density(size = 1, alpha = 0.5, adjust = 2) +
  theme_bw() +
  labs(x = "Age", y = "Density", fill = " ") +
  scale_fill_brewer(palette = "Dark2", labels = c("No stroke", "Stroke")) +
  theme(text = element_text(size = 13.07, colour = "black"),
        axis.text.x = element_text(colour = "black", size = 13.07),
        axis.text.y = element_text(colour = "black", size = 13.07))



# bmi
# Smoothed BMI densities, stratified by stroke outcome.
plot10 <-
  ggplot(stroke_df,
         aes(x = bmi, group = as.factor(stroke), fill = as.factor(stroke))) +
  geom_density(size = 1, alpha = 0.5, adjust = 2) +
  theme_bw() +
  labs(x = "BMI", y = "Density", fill = " ") +
  scale_fill_brewer(palette = "Dark2", labels = c("No stroke", "Stroke")) +
  theme(text = element_text(size = 13.07, colour = "black"),
        axis.text.x = element_text(colour = "black", size = 13.07),
        axis.text.y = element_text(colour = "black", size = 13.07))



# avg_glucose_level
# Smoothed average-glucose-level densities, stratified by stroke outcome.
plot11 <-
  ggplot(stroke_df,
         aes(x = avg_glucose_level, group = as.factor(stroke),
             fill = as.factor(stroke))) +
  geom_density(size = 1, alpha = 0.5, adjust = 2) +
  theme_bw() +
  labs(x = "Avg. glucose level", y = "Density", fill = " ") +
  scale_fill_brewer(palette = "Dark2", labels = c("No stroke", "Stroke")) +
  theme(text = element_text(size = 13.07, colour = "black"),
        axis.text.x = element_text(colour = "black", size = 13.07),
        axis.text.y = element_text(colour = "black", size = 13.07))

#combine plots
# Align the three density plots and arrange them in a single one-row figure.
allplotslist_2 <- align_plots(plot9, plot10, plot11, align = "hv")

grid_3 <- grid.arrange(allplotslist_2[[1]],
                       allplotslist_2[[2]],
                       allplotslist_2[[3]],
                       ncol = 3)

Comment: From these plots we can see that:

Former smokers are more likely to suffer a stroke than current smokers. This could be because former smokers quit after developing health conditions that raised their risk of having a stroke.

Self-employed are under higher risk of suffering a stroke than private and government jobs. Maybe due to higher stress and lack of insurance that are results of being self-employed?

Urban residents, males, and people with hypertension or heart disease are more prone to suffering a stroke. In addition, people who have ever been married are more likely to suffer a stroke than people who have never married.

Age seems to be an important factor, with higher age comes higher chance of having a stroke. There are far more people who developed a stroke that have high glucose level than people with low glucose level.

Recode categorical variables as numeric codes for model training

# Recode categorical predictors as numeric level codes for model training.
# factor() %>% as.numeric() maps each level (in sorted level order) to its
# integer index, so 0/1 variables become 1/2, not 0/1.
# Bug fix: a second, redundant work_type conversion (a no-op re-factoring of
# the already-numeric column) has been removed.
stroke_df$stroke <- as.factor(stroke_df$stroke)
stroke_df$gender <- factor(stroke_df$gender) %>% as.numeric()
stroke_df$ever_married <- factor(stroke_df$ever_married) %>% as.numeric()
stroke_df$work_type <- factor(stroke_df$work_type) %>% as.numeric()
stroke_df$Residence_type <- factor(stroke_df$Residence_type) %>% as.numeric()
stroke_df$smoking_status <- factor(stroke_df$smoking_status) %>% as.numeric()
stroke_df$heart_disease <- factor(stroke_df$heart_disease) %>% as.numeric()
stroke_df$hypertension <- as.numeric(factor(stroke_df$hypertension))
# bmi contains non-numeric entries in the raw file; coercion turns them into
# NA (the "NAs introduced by coercion" warning below is expected).
stroke_df$bmi <- as.numeric(stroke_df$bmi)
## Warning: NAs introduced by coercion
# Drop the id column, relabel the outcome as No/Yes, and remove the
# gender == "Other" observations (coded 3 after conversion).
stroke_df <- stroke_df[, -1] %>% 
    mutate(stroke = recode(stroke, 
                           `0` = "No", 
                           `1` = "Yes"), 
           stroke = factor(stroke)) %>% 
    filter(gender < 3) 


# Post-recoding sanity checks. Binary predictors are now coded 1/2 (not
# 0/1); work_type ranges 1-5 and smoking_status 1-4; bmi has 201 NAs
# introduced by the as.numeric() coercion above.
summary(stroke_df)
##      gender           age         hypertension   heart_disease  
##  Min.   :1.000   Min.   : 0.08   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:25.00   1st Qu.:1.000   1st Qu.:1.000  
##  Median :1.000   Median :45.00   Median :1.000   Median :1.000  
##  Mean   :1.414   Mean   :43.23   Mean   :1.097   Mean   :1.054  
##  3rd Qu.:2.000   3rd Qu.:61.00   3rd Qu.:1.000   3rd Qu.:1.000  
##  Max.   :2.000   Max.   :82.00   Max.   :2.000   Max.   :2.000  
##                                                                 
##   ever_married     work_type     Residence_type  avg_glucose_level
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   : 55.12   
##  1st Qu.:1.000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.: 77.24   
##  Median :2.000   Median :4.000   Median :2.000   Median : 91.88   
##  Mean   :1.656   Mean   :3.495   Mean   :1.508   Mean   :106.14   
##  3rd Qu.:2.000   3rd Qu.:4.000   3rd Qu.:2.000   3rd Qu.:114.09   
##  Max.   :2.000   Max.   :5.000   Max.   :2.000   Max.   :271.74   
##                                                                   
##       bmi        smoking_status  stroke    
##  Min.   :10.30   Min.   :1.000   No :4860  
##  1st Qu.:23.50   1st Qu.:2.000   Yes: 249  
##  Median :28.10   Median :2.000             
##  Mean   :28.89   Mean   :2.586             
##  3rd Qu.:33.10   3rd Qu.:4.000             
##  Max.   :97.60   Max.   :4.000             
##  NA's   :201
# Missingness map: per the summary above, bmi is the only incomplete column.
vis_miss(stroke_df)

# Spot check of the recoded values on the first rows.
head(stroke_df)
##   gender age hypertension heart_disease ever_married work_type Residence_type
## 1      2  67            1             2            2         4              2
## 2      1  61            1             1            2         5              1
## 3      2  80            1             2            2         4              1
## 4      1  49            1             1            2         4              2
## 5      1  79            2             1            2         5              1
## 6      2  81            1             1            2         4              2
##   avg_glucose_level  bmi smoking_status stroke
## 1            228.69 36.6              1    Yes
## 2            202.21   NA              2    Yes
## 3            105.92 32.5              2    Yes
## 4            171.23 34.4              3    Yes
## 5            174.12 24.0              2    Yes
## 6            186.21 29.0              1    Yes

Partition the dataset

# Reproducible 70/30 split, stratified on the outcome by
# createDataPartition(). FALSE is spelled out (never use T/F).
set.seed(123)
trRow <- createDataPartition(y = stroke_df$stroke, p = 0.7, list = FALSE)
train.data <- stroke_df[trRow, ]
test.data <- stroke_df[-trRow, ]

Imputation with preProcess()

# k-NN imputation for the missing bmi values. NOTE(review): caret's
# "knnImpute" method also centers and scales every predictor as a side
# effect, so both sets are standardized from here on — confirm downstream
# code expects that. The transform is fit on the training set only and then
# applied to the test set, avoiding information leakage.
knnImp <- preProcess(train.data, method = "knnImpute", k = 3)
train.data <- predict(knnImp, train.data)
vis_miss(train.data)

test.data <- predict(knnImp, test.data)
vis_miss(test.data)

Models

We try the following models to see which algorithm fits best: because our outcome is binary, we will proceed with whichever classifier performs best, using accuracy and ROC/AUC as our evaluation metrics.

Logistic regression

GLM

# Logistic regression via glm().
# Bug fix: the model was previously fit on the raw stroke_df (unimputed,
# unscaled) while predictions were made on the preProcess()-transformed
# test.data, so train and test features were on different scales. It is now
# fit on train.data, which went through the same knnImpute pipeline.
glm.fit <- glm(stroke ~ .,
               data = train.data,
               family = binomial(link = "logit"))

summary(glm.fit)

#prediction
# Predicted probabilities of the positive class ("Yes") on the test set.
test.pred.prob <- predict(glm.fit, newdata = test.data,
                          type = "response")

# Hard classification at the conventional 0.5 cutoff.
test.pred <- rep("No", length(test.pred.prob))
test.pred[test.pred.prob > 0.5] <- "Yes"
head(test.pred)

# 2x2 table against the held-out labels (same rows as stroke_df[-trRow, ]).
confusionMatrix(data = as.factor(test.pred),
                reference = test.data$stroke,
                positive = "Yes")

roc.glm <- roc(test.data$stroke, test.pred.prob)
plot(roc.glm, legacy.axes = TRUE, print.auc = TRUE)
plot(smooth(roc.glm), col = 4, add = TRUE)

# Accuracy = 0.9517 = proportion of correctly classified = (TN + TP) / total,
# 95% CI = (0.9397, 0.9619).

# NIR = 0.9517 = maximum of (observed negative rate, observed positive rate).
# Extreme case: all observed y is 0 (neg) => NIR = 1. With an extremely
# unbalanced class, NIR is very high: a trivial classifier that assigns every
# observation to the majority (negative) class still achieves high accuracy.
# Accuracy is therefore not meaningful for highly imbalanced data.

# p-value = 0.5309, Kappa = 0. Null hypothesis: Accuracy = NIR.
# P-Value [Acc > NIR] = 0.5309 >> 0.05, so we fail to reject — accuracy is
# not better than NIR.

# Kappa measures agreement between observed and predicted labels, accounting
# for agreement by chance. Kappa = 0 here => no agreement beyond chance.
# Using caret
# 10-fold cross-validation, optimizing ROC (twoClassSummary requires class
# probabilities).
ctrl <- trainControl(method = "cv",
                     summaryFunction = twoClassSummary,
                     classProbs = TRUE)
set.seed(1)
# Columns 1:10 are the predictors; column 11 is the outcome (stroke).
model.glm <- train(x = train.data[, c(1:10)],
                   y = train.data$stroke,
                   method = "glm",
                   metric = "ROC",
                   trControl = ctrl)

glm.pred = predict(model.glm, newdata = test.data, type = "prob")

# Hard labels at the 0.5 cutoff; with ~5% prevalence the model predicts
# "No" for every test observation (see Kappa = 0 in the output below).
glm.prob = ifelse(glm.pred$Yes > 0.5,  "Yes", "No")

confusionMatrix(data = as.factor(glm.prob),
                reference = test.data$stroke,
                positive = "Yes")
## Warning in confusionMatrix.default(data = as.factor(glm.prob), reference =
## test.data$stroke, : Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1458   74
##        Yes    0    0
##                                           
##                Accuracy : 0.9517          
##                  95% CI : (0.9397, 0.9619)
##     No Information Rate : 0.9517          
##     P-Value [Acc > NIR] : 0.5309          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.9517          
##              Prevalence : 0.0483          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Yes             
## 
# ROC/AUC is threshold-free and therefore more informative here. Column 2 of
# glm.pred holds the probability of the positive class ("Yes").
roc.glm = roc(test.data$stroke, glm.pred[,2])
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
auc.glm = roc.glm$auc[1]
auc.glm
## [1] 0.8423516
plot(roc.glm, legacy.axes = TRUE, print.auc = TRUE)
plot(smooth(roc.glm), col = 4, add = TRUE)

KNN

## 161-nearest neighbor model
## Training set outcome distribution:
## 
##   No  Yes 
## 3402  175
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1458   74
##        Yes    0    0
##                                           
##                Accuracy : 0.9517          
##                  95% CI : (0.9397, 0.9619)
##     No Information Rate : 0.9517          
##     P-Value [Acc > NIR] : 0.5309          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.9517          
##              Prevalence : 0.0483          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Yes             
## 
## [1] 0.8195927

GAM

set.seed(1)

# GAM via caret (method "gam"), cross-validated with the same ctrl as the
# GLM. Bug fix: the predictor matrix is columns 1:10 only — column 11 is
# the outcome (stroke) and must not be passed as a feature (the GLM above
# already uses 1:10).
model.gam <- train(x = train.data[, c(1:10)],
                   y = train.data$stroke,
                   method = "gam",
                   metric = "ROC",
                   trControl = ctrl)


# Fitted GAM: linear terms for the categorical codes plus smooth terms s()
# for age, bmi, and avg_glucose_level (see the formula below).
model.gam$finalModel
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## .outcome ~ gender + hypertension + heart_disease + ever_married + 
##     Residence_type + smoking_status + work_type + s(age) + s(bmi) + 
##     s(avg_glucose_level)
## 
## Estimated degrees of freedom:
## 3.7044 0.0017 0.8089  total = 12.51 
## 
## UBRE score: -0.6829292
gam.pred = predict(model.gam, newdata = test.data, type = "prob")

# Hard labels at the 0.5 cutoff; as with the GLM, the class imbalance yields
# all-"No" predictions (Kappa = 0 below).
gam.prob = ifelse(gam.pred$Yes > 0.5,  "Yes", "No")

confusionMatrix(data = as.factor(gam.prob),
                reference = test.data$stroke,
                positive = "Yes")
## Warning in confusionMatrix.default(data = as.factor(gam.prob), reference =
## test.data$stroke, : Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1458   74
##        Yes    0    0
##                                           
##                Accuracy : 0.9517          
##                  95% CI : (0.9397, 0.9619)
##     No Information Rate : 0.9517          
##     P-Value [Acc > NIR] : 0.5309          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.9517          
##              Prevalence : 0.0483          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Yes             
## 
# Test-set ROC/AUC; column 2 of gam.pred is the "Yes" probability.
roc.gam = roc(test.data$stroke, gam.pred[,2])
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
auc.gam = roc.gam$auc[1]
auc.gam
## [1] 0.8431858
plot(roc.gam, legacy.axes = TRUE, print.auc = TRUE)
plot(smooth(roc.gam), col = 4, add = TRUE)

LDA

# Partition plots: LDA decision boundaries for each pair of the three
# continuous predictors, drawn on the training rows of stroke_df.
partimat(stroke ~ avg_glucose_level + age + bmi,
         data = stroke_df, subset = trRow, method = "lda")

QDA

Naive Bayes

Classification trees

Logistic regression assumes that the data is linearly separable in space but decision trees do not. Decision trees also handle skewed data better.

Compare models

# based on cv
# Compare the three caret models on their cross-validation resamples.
res <- resamples(list(GLM = model.glm,  
                      GAM = model.gam,
                      KNN = model.knn))
summary(res)
## 
## Call:
## summary.resamples(object = res)
## 
## Models: GLM, GAM, KNN 
## Number of resamples: 10 
## 
## ROC 
##          Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## GLM 0.7294118 0.8085310 0.8422962 0.8354419 0.8775695 0.8987889    0
## GAM 0.7158497 0.8104623 0.8460400 0.8346551 0.8759454 0.8959150    0
## KNN 0.6936275 0.7876946 0.8112697 0.8020851 0.8266307 0.8522320    0
## 
## Sens 
##     Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## GLM    1       1      1    1       1    1    0
## GAM    1       1      1    1       1    1    0
## KNN    1       1      1    1       1    1    0
## 
## Spec 
##     Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## GLM    0       0      0    0       0    0    0
## GAM    0       0      0    0       0    0    0
## KNN    0       0      0    0       0    0    0
# NOTE(review): "AUC" is not among the recorded resamples metrics (only ROC,
# Sens, Spec appear above), so only ROC will be plotted — confirm intent.
bwplot(res, metric = c("ROC", "AUC"))

# GLM and GAM perform better compared to KNN. KNN usually requires a larger
# dataset to perform as well as models with a parametric structure.
# Extract test-set probabilities: 1st column is the probability of the
# negative class ("No"), 2nd is the positive class ("Yes").
glm.pred <- predict(model.glm, newdata = test.data, type = "prob")[,2]
knn.pred <- predict(model.knn, newdata = test.data, type = "prob")[,2]
gam.pred <- predict(model.gam, newdata = test.data, type = "prob")[,2]


# Test-set ROC curves for the three models.
roc.glm <- roc(test.data$stroke, glm.pred)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
roc.knn <- roc(test.data$stroke, knn.pred)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
roc.gam <- roc(test.data$stroke, gam.pred)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
auc <- c(roc.glm$auc[1], roc.knn$auc[1],
         roc.gam$auc[1])

# Overlay the three ROC curves and annotate each model with its AUC.
plot(roc.glm, legacy.axes = TRUE)
plot(roc.knn, col = 2, add = TRUE)
plot(roc.gam, col = 3, add = TRUE)


modelNames <- c("glm","knn","gam")
legend("bottomright", legend = paste0(modelNames, ": ", round(auc,3)),
       col = 1:3, lwd = 2)